D:\a\csshw\csshw\xtask\src\typography.rs
Line | Count | Source |
1 | | //! Typography linter that blocks decorative or "smart" Unicode |
2 | | //! punctuation from sneaking into the repository. |
3 | | //! |
4 | | //! Agents tend to introduce em-dashes, en-dashes, smart quotes, |
5 | | //! ellipsis, arrows, and similar non-ASCII glyphs in comments and |
6 | | //! prose. They look similar to their ASCII equivalents but are not |
7 | | //! what a Windows developer types and not what `cargo fmt` produces. |
8 | | //! |
9 | | //! [`check_typography`] enumerates tracked text files via |
10 | | //! `git ls-files`, scans each for a curated blocklist of code points, |
11 | | //! prints any violations as `path:line:col U+XXXX 'glyph'`, and |
12 | | //! returns an error when at least one violation is found so the |
13 | | //! pre-commit hook and CI both abort. |
14 | | //! |
15 | | //! Performance: the scan runs inside the pre-commit hook, so the |
16 | | //! hot path reads bytes, exits early on pure-ASCII input, and only |
17 | | //! decodes UTF-8 for files that actually contain non-ASCII bytes. |
18 | | |
19 | | use std::path::{Path, PathBuf}; |
20 | | |
21 | | use anyhow::{bail, Context, Result}; |
22 | | |
/// Extensions (written in lowercase) that opt a tracked file into the
/// scan.
///
/// The comparison against a path's extension ignores ASCII case. A
/// path that contains no `.` at all is only scanned when it is listed
/// in [`SCAN_EXTRA_PATHS`].
const SCAN_EXTENSIONS: &[&str] = &[
    "rs", "md", "toml", "yml", "yaml", "json", "html", "txt", "cfg", "sh", "ps1", "js", "mjs",
];

/// Extension-less tracked paths that are scanned anyway (hooks, shell
/// scripts and the like).
///
/// Entries must match the `git ls-files` spelling exactly, i.e.
/// repository-relative with forward slashes.
const SCAN_EXTRA_PATHS: &[&str] = &[".githooks/pre-commit"];

/// Tracked paths that are never scanned, even when their extension is
/// listed in [`SCAN_EXTENSIONS`].
///
/// Reserved for:
///
/// - generated artefacts such as `Cargo.lock`,
/// - files that may legitimately preserve historical typography from
///   prior releases,
/// - templates and workflow snippets whose non-ASCII content is
///   intentional and rendered to users (e.g. social-preview titles,
///   GitHub Pages footers, PR-comment heredocs).
///
/// Resist growing this list -- offending content should be fixed, not
/// allowlisted. Entries must match the `git ls-files` spelling exactly
/// (forward slashes).
const ALLOWED_PATHS: &[&str] = &[
    "Cargo.lock",
    ".github/workflows/news-fragment-check.yml",
    "templates/github-pages-index.html",
    "templates/social-preview.html",
];

/// Largest file, in bytes, that the scanner is willing to read (5 MiB).
///
/// Bigger files are skipped with a warning instead of failing the
/// run: nothing in the repo comes close to this size, and a
/// pathological large file should not block a commit.
const MAX_FILE_BYTES: u64 = 5 * 1024 * 1024;
59 | | |
60 | | /// All side-effecting operations performed by the typography scanner. |
61 | | /// |
62 | | /// Implement with mocks in tests to achieve zero filesystem and |
63 | | /// process side-effects. |
64 | | pub trait TypographySystem { |
65 | | /// Return the list of tracked files reported by `git ls-files`. |
66 | | /// |
67 | | /// Paths are returned with forward slashes (the format `git` |
68 | | /// emits on every platform). |
69 | | /// |
70 | | /// # Errors |
71 | | /// |
72 | | /// Returns an error if the `git` process cannot be started or |
73 | | /// exits non-zero. |
74 | | fn list_tracked_files(&self) -> Result<Vec<String>>; |
75 | | |
76 | | /// Return the size in bytes of the file at `path`. |
77 | | /// |
78 | | /// # Errors |
79 | | /// |
80 | | /// Returns an error if the file cannot be stat-ed. |
81 | | fn file_size(&self, path: &Path) -> Result<u64>; |
82 | | |
83 | | /// Read the full contents of the file at `path` as raw bytes. |
84 | | /// |
85 | | /// # Errors |
86 | | /// |
87 | | /// Returns an error if the file cannot be read. |
88 | | fn read_file(&self, path: &Path) -> Result<Vec<u8>>; |
89 | | |
90 | | /// Emit a message to the user (informational or warning). |
91 | | /// |
92 | | /// # Arguments |
93 | | /// |
94 | | /// * `msg` - Message to display. |
95 | | fn log(&self, msg: &str); |
96 | | } |
97 | | |
98 | | /// Production implementation of [`TypographySystem`]. |
99 | | pub struct RealSystem; |
100 | | |
101 | | #[cfg_attr(coverage_nightly, coverage(off))] |
102 | | impl TypographySystem for RealSystem { |
103 | | fn list_tracked_files(&self) -> Result<Vec<String>> { |
104 | | let output = std::process::Command::new("git") |
105 | | .args(["ls-files"]) |
106 | | .output() |
107 | | .context("failed to run `git ls-files`")?; |
108 | | if !output.status.success() { |
109 | | bail!( |
110 | | "`git ls-files` exited non-zero: {}", |
111 | | String::from_utf8_lossy(&output.stderr) |
112 | | ); |
113 | | } |
114 | | let stdout = |
115 | | String::from_utf8(output.stdout).context("`git ls-files` produced non-UTF-8 output")?; |
116 | | Ok(stdout |
117 | | .lines() |
118 | | .filter(|line| !line.is_empty()) |
119 | | .map(|line| line.to_owned()) |
120 | | .collect()) |
121 | | } |
122 | | |
123 | | fn file_size(&self, path: &Path) -> Result<u64> { |
124 | | let meta = std::fs::metadata(path) |
125 | | .with_context(|| format!("failed to stat {}", path.display()))?; |
126 | | Ok(meta.len()) |
127 | | } |
128 | | |
129 | | fn read_file(&self, path: &Path) -> Result<Vec<u8>> { |
130 | | std::fs::read(path).with_context(|| format!("failed to read {}", path.display())) |
131 | | } |
132 | | |
133 | | fn log(&self, msg: &str) { |
134 | | eprintln!("{msg}"); |
135 | | } |
136 | | } |
137 | | |
/// One blocklisted code point together with the place it was found.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Violation {
    /// Path of the offending file, relative to the repository root
    /// and written with forward slashes.
    pub path: String,
    /// Line of the offending character, counted from 1.
    pub line: u32,
    /// Column of the offending character, counted from 1 in `char`s
    /// (not bytes).
    pub column: u32,
    /// The Unicode scalar value that triggered the violation.
    pub character: char,
}
151 | | |
/// Tell whether `c` is one of the code points the scanner rejects.
///
/// The set is curated by hand and targets the decorative glyphs that
/// LLMs habitually substitute for ASCII punctuation. It is not a
/// blanket non-ASCII ban: emoji and other non-ASCII characters are
/// deliberately left alone.
///
/// # Arguments
///
/// * `c` - Character to classify.
///
/// # Returns
///
/// `true` when `c` is on the blocklist, `false` otherwise.
pub fn is_blocklisted(c: char) -> bool {
    // Patterns use `\u{...}` escapes so this file stays pure ASCII.
    matches!(
        c,
        // No-break space, middle dot, multiplication and division signs.
        '\u{00A0}' | '\u{00B7}' | '\u{00D7}' | '\u{00F7}'
        // Exotic spaces.
        | '\u{2000}'..='\u{200B}'
        | '\u{202F}' | '\u{205F}' | '\u{3000}'
        // Hyphens, en/em-dashes, horizontal bar, minus sign.
        | '\u{2010}'..='\u{2015}' | '\u{2212}'
        // Smart single and double quotes.
        | '\u{2018}'..='\u{201F}'
        // Bullet, ellipsis.
        | '\u{2022}' | '\u{2026}'
        // The whole Arrows block.
        | '\u{2190}'..='\u{21FF}'
        // Math comparison glyphs.
        | '\u{2248}' | '\u{2260}' | '\u{2264}' | '\u{2265}'
    )
}
186 | | |
187 | | /// Decide whether `path` should be scanned. |
188 | | /// |
189 | | /// A file is scanned when: |
190 | | /// |
191 | | /// 1. it is not in [`ALLOWED_PATHS`], and |
192 | | /// 2. its lowercase extension is in [`SCAN_EXTENSIONS`], or its path |
193 | | /// appears verbatim in [`SCAN_EXTRA_PATHS`]. |
194 | | /// |
195 | | /// # Arguments |
196 | | /// |
197 | | /// * `path` - Forward-slash relative path as emitted by |
198 | | /// `git ls-files`. |
199 | | /// |
200 | | /// # Returns |
201 | | /// |
202 | | /// `true` when the file should be scanned, `false` otherwise. |
203 | 21 | pub fn should_scan(path: &str) -> bool { |
204 | 21 | if ALLOWED_PATHS.contains(&path) { |
205 | 5 | return false; |
206 | 16 | } |
207 | 16 | if SCAN_EXTRA_PATHS.contains(&path) { |
208 | 1 | return true; |
209 | 15 | } |
210 | 15 | let Some(dot14 ) = path.rfind('.') else { |
211 | 1 | return false; |
212 | | }; |
213 | 14 | let ext = &path[dot + 1..]; |
214 | 14 | SCAN_EXTENSIONS |
215 | 14 | .iter() |
216 | 73 | .any14 (|allowed| allowed.eq_ignore_ascii_case(ext)) |
217 | 21 | } |
218 | | |
219 | | /// Scan a single file's contents and return any violations. |
220 | | /// |
221 | | /// Pure function -- no I/O. Files that are pure ASCII return early |
222 | | /// before allocating or decoding UTF-8, which keeps the common case |
223 | | /// (almost every `.rs` file in this repo) cheap. |
224 | | /// |
225 | | /// Files that are not valid UTF-8 are reported via the returned |
226 | | /// `non_utf8` flag and produce no violations; the caller decides |
227 | | /// whether to surface that as a warning. |
228 | | /// |
229 | | /// # Arguments |
230 | | /// |
231 | | /// * `path` - Display path used when constructing violations. |
232 | | /// * `bytes` - Raw file contents. |
233 | | /// |
234 | | /// # Returns |
235 | | /// |
236 | | /// `(violations, non_utf8)` where `non_utf8` is `true` if the file |
237 | | /// could not be decoded as UTF-8. |
238 | 10 | pub fn scan_bytes(path: &str, bytes: &[u8]) -> (Vec<Violation>, bool) { |
239 | | // Fast path: pure ASCII -> nothing to flag. |
240 | 112 | if bytes.iter()10 .all10 (|&b| b < 0x80) { |
241 | 3 | return (Vec::new(), false); |
242 | 7 | } |
243 | | |
244 | 7 | let Ok(text5 ) = std::str::from_utf8(bytes) else { |
245 | 2 | return (Vec::new(), true); |
246 | | }; |
247 | | |
248 | 5 | let mut violations = Vec::new(); |
249 | 5 | let mut line: u32 = 1; |
250 | 5 | let mut column: u32 = 1; |
251 | 64 | for c in text5 .chars5 () { |
252 | 64 | if c == '\n' { |
253 | 5 | line += 1; |
254 | 5 | column = 1; |
255 | 5 | continue; |
256 | 59 | } |
257 | 59 | if c == '\r' { |
258 | | // CRLF: do not advance the column. The following '\n' resets it. |
259 | 0 | continue; |
260 | 59 | } |
261 | 59 | if is_blocklisted(c) { |
262 | 5 | violations.push(Violation { |
263 | 5 | path: path.to_owned(), |
264 | 5 | line, |
265 | 5 | column, |
266 | 5 | character: c, |
267 | 5 | }); |
268 | 54 | } |
269 | 59 | column += 1; |
270 | | } |
271 | 5 | (violations, false) |
272 | 10 | } |
273 | | |
274 | | /// Scan every tracked text file and report violations. |
275 | | /// |
276 | | /// Reads the file list via `git ls-files`, filters it through |
277 | | /// [`should_scan`], and runs [`scan_bytes`] on each remaining file. |
278 | | /// Violations are printed to stderr as |
279 | | /// `path:line:col U+XXXX 'glyph'`. |
280 | | /// |
281 | | /// # Arguments |
282 | | /// |
283 | | /// * `system` - Injected I/O provider. |
284 | | /// |
285 | | /// # Returns |
286 | | /// |
287 | | /// `Ok(())` when no violations are found. |
288 | | /// |
289 | | /// # Errors |
290 | | /// |
291 | | /// Returns an error when at least one violation is found, or when an |
292 | | /// I/O operation fails. Files that are too large or not valid UTF-8 |
293 | | /// are skipped with a warning and do not fail the run. |
294 | 4 | pub fn check_typography<S: TypographySystem>(system: &S) -> Result<()> { |
295 | 4 | let files = system.list_tracked_files()?0 ; |
296 | 4 | let mut violations: Vec<Violation> = Vec::new(); |
297 | 5 | for rel in files4 { |
298 | 5 | if !should_scan(&rel) { |
299 | 1 | continue; |
300 | 4 | } |
301 | 4 | let path = PathBuf::from(&rel); |
302 | 4 | let size = system.file_size(&path)?0 ; |
303 | 4 | if size > MAX_FILE_BYTES { |
304 | 1 | system.log(&format!( |
305 | 1 | "WARNING - skipping {rel}: {size} bytes exceeds {MAX_FILE_BYTES} byte cap" |
306 | 1 | )); |
307 | 1 | continue; |
308 | 3 | } |
309 | 3 | let bytes = system.read_file(&path)?0 ; |
310 | 3 | let (mut found, non_utf8) = scan_bytes(&rel, &bytes); |
311 | 3 | if non_utf8 { |
312 | 1 | system.log(&format!("WARNING - skipping {rel}: not valid UTF-8")); |
313 | 1 | continue; |
314 | 2 | } |
315 | 2 | violations.append(&mut found); |
316 | | } |
317 | | |
318 | 4 | if violations.is_empty() { |
319 | 3 | println!("INFO - check-typography: no forbidden Unicode found."); |
320 | 3 | return Ok(()); |
321 | 1 | } |
322 | | |
323 | 1 | eprintln!( |
324 | | "ERROR - check-typography: found {} forbidden Unicode character(s).", |
325 | 1 | violations.len() |
326 | | ); |
327 | 1 | eprintln!(" Replace them with their ASCII equivalents (em/en-dashes -> '-',"); |
328 | 1 | eprintln!(" smart quotes -> ' or \", ellipsis -> ..., arrows -> -> / <-, etc.)."); |
329 | 1 | eprintln!(); |
330 | 1 | for v in &violations { |
331 | 1 | eprintln!( |
332 | 1 | "{}:{}:{} U+{:04X} {:?}", |
333 | 1 | v.path, v.line, v.column, v.character as u32, v.character |
334 | 1 | ); |
335 | 1 | } |
336 | 1 | bail!("found {} forbidden Unicode character(s)", violations.len()) |
337 | 4 | } |
338 | | |
339 | | #[cfg(test)] |
340 | | #[path = "tests/test_typography.rs"] |
341 | | mod tests; |